#' =============================================================================
#' FILE: 00_lapop_data.R
#' DESCRIPTION:
#'   Processes LAPOP survey data (2008–2018) to analyze party identification
#'   (partisanship) in Latin America. Imports, cleans, and merges country-level
#'   datasets for each survey year, standardizes variable names/values, and
#'   produces a harmonized dataset for analysis.
#'
#' NOTE:
#'   Download the required `.dta` files for each year/country and place them in
#'   the corresponding folders. For more information, see readme_lapop_data.
#'   Data source: https://www.vanderbilt.edu/lapop/
#'   For replication, use the harmonized file generated at the end of this script.
#'
#' PACKAGES REQUIRED: pacman, tidyverse, haven
#'
#' OUTPUTS:
#'   - 04_outputs/lapop/lapop_2008.rds
#'   - 04_outputs/lapop/lapop_2010.rds
#'   - 04_outputs/lapop/lapop_2012.rds
#'   - 04_outputs/lapop/lapop_2014.rds
#'   - 04_outputs/lapop/lapop_2016.rds
#'   - 04_outputs/lapop/lapop_2018.rds
#'   - 04_outputs/lapop_partisanship.rds
#' =============================================================================

# SECTION: Load Required Packages ----------------------------------------------
# Install and load required packages
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, haven)

# SECTION: Process 2018 LAPOP Data ---------------------------------------------
# Reads every 2018 country file, keeps the files that contain all required
# variables, stacks them into one data frame, converts value labels to text,
# tags the survey year, and saves the result as an .rds file.

# Import files. The pattern is escaped and anchored so that only file names
# ending in ".dta" match (an unescaped "." matches any character).
data_dir <- "02_data/lapop/2018"
paths <- list.files(path = data_dir, pattern = "\\.dta$", full.names = TRUE)
if (length(paths) == 0) {
  # paste0(dir, character(0)) would otherwise hand the bare directory to
  # read_dta() and fail with an unhelpful message.
  stop("No .dta files found in ", data_dir, call. = FALSE)
}
dataFiles <- lapply(paths, read_dta)

# Country names for datasets (used only in the diagnostics printed below)
files <- basename(paths)
files <- sub("(^\\w+)\\s.+", "\\1", files)
files <- gsub("[[:digit:]]+", "", files)

# Is the data available for every country?
required_vars <- c("pais", "vb10", "wt") # country, party ID, weight
missing_vars <- lapply(dataFiles, function(d) setdiff(required_vars, names(d)))
for (i in seq_along(dataFiles)) {
  print(i)
  print(files[i])
  print(missing_vars[[i]])
}
countries <- lengths(missing_vars) == 0
# Original note: United states and Canada do not have partisanship

# Drop files that lack any required variable
if (!any(countries)) {
  stop("No 2018 file contains all of: ",
    paste(required_vars, collapse = ", "),
    call. = FALSE
  )
}
dataFiles <- dataFiles[countries]

# Extract relevant variables and stack them straight from the list. This avoids
# writing one object per country into the global environment and then
# collecting *every* data frame found there, which would also pick up unrelated
# data frames and silently overwrite files that clean to the same name.
# Rows now follow the (alphabetical) file order returned by list.files().
df <- bind_rows(lapply(dataFiles, function(d) d[, required_vars]))

# Labels to values.
df$pais <- as.character(as_factor(df$pais))
df$vb10 <- as.character(as_factor(df$vb10))

# Add year variable
df$year <- 2018

# Save processed data (create the output folder if it does not exist yet)
dir.create("04_outputs/lapop", recursive = TRUE, showWarnings = FALSE)
saveRDS(df, "04_outputs/lapop/lapop_2018.rds")

# Remove only the objects created in this section; unrelated objects in the
# workspace are left alone.
rm(
  data_dir, paths, dataFiles, files, required_vars, missing_vars,
  countries, i, df
)

# SECTION: Process 2016 LAPOP Data ---------------------------------------------
# Reads every 2016 country file, keeps the files that contain all required
# variables, stacks them into one data frame, converts value labels to text,
# tags the survey year, and saves the result as an .rds file.

# Import files. The pattern is escaped and anchored so that only file names
# ending in ".dta" match (an unescaped "." matches any character).
data_dir <- "02_data/lapop/2016"
paths <- list.files(path = data_dir, pattern = "\\.dta$", full.names = TRUE)
if (length(paths) == 0) {
  # paste0(dir, character(0)) would otherwise hand the bare directory to
  # read_dta() and fail with an unhelpful message.
  stop("No .dta files found in ", data_dir, call. = FALSE)
}
dataFiles <- lapply(paths, read_dta)

# Country names for datasets (used only in the diagnostics printed below)
files <- basename(paths)
files <- sub("(^\\w+)\\s.+", "\\1", files)
files <- gsub("[[:digit:]]+", "", files)

# Is the data available for every country?
required_vars <- c("pais", "vb10", "wt") # country, party ID, weight
missing_vars <- lapply(dataFiles, function(d) setdiff(required_vars, names(d)))
for (i in seq_along(dataFiles)) {
  print(i)
  print(files[i])
  print(missing_vars[[i]])
}
countries <- lengths(missing_vars) == 0

# Drop files that lack any required variable.
# NOTE(review): the original comment listed StLucia, StKittsNevis, Antigua, US,
# StVincent, Grenada, Dominica and Canada as the dropped files -- confirm
# against the diagnostics printed above.
if (!any(countries)) {
  stop("No 2016 file contains all of: ",
    paste(required_vars, collapse = ", "),
    call. = FALSE
  )
}
dataFiles <- dataFiles[countries]

# Extract relevant variables and stack them straight from the list. This avoids
# writing one object per country into the global environment and then
# collecting *every* data frame found there, which would also pick up unrelated
# data frames and silently overwrite files that clean to the same name.
# Rows now follow the (alphabetical) file order returned by list.files().
df <- bind_rows(lapply(dataFiles, function(d) d[, required_vars]))

# Labels to values.
df$pais <- as.character(as_factor(df$pais))
df$vb10 <- as.character(as_factor(df$vb10))

# Year
df$year <- 2016

# Save (create the output folder if it does not exist yet)
dir.create("04_outputs/lapop", recursive = TRUE, showWarnings = FALSE)
saveRDS(df, "04_outputs/lapop/lapop_2016.rds")

# Remove only the objects created in this section; unrelated objects in the
# workspace are left alone.
rm(
  data_dir, paths, dataFiles, files, required_vars, missing_vars,
  countries, i, df
)

# SECTION: Process 2014 LAPOP Data ---------------------------------------------
# Reads every 2014 country file, keeps the files that contain all required
# variables, stacks them into one data frame, converts value labels to text,
# tags the survey year, and saves the result as an .rds file.

# Import files. The pattern is escaped and anchored so that only file names
# ending in ".dta" match (an unescaped "." matches any character).
data_dir <- "02_data/lapop/2014"
paths <- list.files(path = data_dir, pattern = "\\.dta$", full.names = TRUE)
if (length(paths) == 0) {
  # paste0(dir, character(0)) would otherwise hand the bare directory to
  # read_dta() and fail with an unhelpful message.
  stop("No .dta files found in ", data_dir, call. = FALSE)
}
dataFiles <- lapply(paths, read_dta)

# Country names for datasets (used only in the diagnostics printed below)
files <- basename(paths)
files <- sub("(^\\w+)\\s.+", "\\1", files)
files <- gsub("[[:digit:]]+", "", files)

# Is the data available for every country?
required_vars <- c("pais", "vb10", "wt") # country, party ID, weight
missing_vars <- lapply(dataFiles, function(d) setdiff(required_vars, names(d)))
for (i in seq_along(dataFiles)) {
  print(i)
  print(files[i])
  print(missing_vars[[i]])
}
countries <- lengths(missing_vars) == 0

# Drop files that lack any required variable.
# NOTE(review): the original comment listed StLucia, StKittsNevis, Antigua, US,
# StVincent, Grenada, Dominica and Canada as the dropped files; the same text
# appears in several year sections -- confirm against the diagnostics above.
if (!any(countries)) {
  stop("No 2014 file contains all of: ",
    paste(required_vars, collapse = ", "),
    call. = FALSE
  )
}
dataFiles <- dataFiles[countries]

# Extract relevant variables and stack them straight from the list. This avoids
# writing one object per country into the global environment and then
# collecting *every* data frame found there, which would also pick up unrelated
# data frames and silently overwrite files that clean to the same name.
# Rows now follow the (alphabetical) file order returned by list.files().
df <- bind_rows(lapply(dataFiles, function(d) d[, required_vars]))

# Labels to values.
df$pais <- as.character(as_factor(df$pais))
df$vb10 <- as.character(as_factor(df$vb10))

# Year
df$year <- 2014

# Save (create the output folder if it does not exist yet)
dir.create("04_outputs/lapop", recursive = TRUE, showWarnings = FALSE)
saveRDS(df, "04_outputs/lapop/lapop_2014.rds")

# Remove only the objects created in this section; unrelated objects in the
# workspace are left alone.
rm(
  data_dir, paths, dataFiles, files, required_vars, missing_vars,
  countries, i, df
)

# SECTION: Process 2012 LAPOP Data ---------------------------------------------
# Reads every 2012 country file, keeps the files that contain all required
# variables, stacks them into one data frame, converts value labels to text,
# tags the survey year, and saves the result as an .rds file.

# Import files. The pattern is escaped and anchored so that only file names
# ending in ".dta" match (an unescaped "." matches any character).
data_dir <- "02_data/lapop/2012"
paths <- list.files(path = data_dir, pattern = "\\.dta$", full.names = TRUE)
if (length(paths) == 0) {
  # paste0(dir, character(0)) would otherwise hand the bare directory to
  # read_dta() and fail with an unhelpful message.
  stop("No .dta files found in ", data_dir, call. = FALSE)
}
dataFiles <- lapply(paths, read_dta)

# Country names for datasets (used only in the diagnostics printed below)
files <- basename(paths)
files <- sub("(^\\w+)\\s.+", "\\1", files)
files <- gsub("[[:digit:]]+", "", files)

# Is the data available for every country?
required_vars <- c("pais", "vb10", "wt") # country, party ID, weight
missing_vars <- lapply(dataFiles, function(d) setdiff(required_vars, names(d)))
for (i in seq_along(dataFiles)) {
  print(i)
  print(files[i])
  print(missing_vars[[i]])
}
countries <- lengths(missing_vars) == 0

# Drop files that lack any required variable.
# NOTE(review): the original comment listed StLucia, StKittsNevis, Antigua, US,
# StVincent, Grenada, Dominica and Canada as the dropped files; the same text
# appears in several year sections -- confirm against the diagnostics above.
if (!any(countries)) {
  stop("No 2012 file contains all of: ",
    paste(required_vars, collapse = ", "),
    call. = FALSE
  )
}
dataFiles <- dataFiles[countries]

# Extract relevant variables and stack them straight from the list. This avoids
# writing one object per country into the global environment and then
# collecting *every* data frame found there, which would also pick up unrelated
# data frames and silently overwrite files that clean to the same name.
# Rows now follow the (alphabetical) file order returned by list.files().
df <- bind_rows(lapply(dataFiles, function(d) d[, required_vars]))

# Labels to values.
df$pais <- as.character(as_factor(df$pais))
df$vb10 <- as.character(as_factor(df$vb10))

# Year
df$year <- 2012

# Save (create the output folder if it does not exist yet)
dir.create("04_outputs/lapop", recursive = TRUE, showWarnings = FALSE)
saveRDS(df, "04_outputs/lapop/lapop_2012.rds")

# Remove only the objects created in this section; unrelated objects in the
# workspace are left alone.
rm(
  data_dir, paths, dataFiles, files, required_vars, missing_vars,
  countries, i, df
)

# SECTION: Process 2010 LAPOP Data ---------------------------------------------
# Reads every 2010 country file, keeps the files that contain all required
# variables, stacks them into one data frame, converts value labels to text,
# tags the survey year, and saves the result as an .rds file. No weight
# variable is extracted for this year.

# Import files. The pattern is escaped and anchored so that only file names
# ending in ".dta" match (an unescaped "." matches any character).
data_dir <- "02_data/lapop/2010"
paths <- list.files(path = data_dir, pattern = "\\.dta$", full.names = TRUE)
if (length(paths) == 0) {
  # paste0(dir, character(0)) would otherwise hand the bare directory to
  # read_dta() and fail with an unhelpful message.
  stop("No .dta files found in ", data_dir, call. = FALSE)
}
dataFiles <- lapply(paths, read_dta)

# Country names for datasets (used only in the diagnostics printed below)
files <- basename(paths)
files <- sub("(^\\w+)\\s.+", "\\1", files)
files <- gsub("[[:digit:]]+", "", files)
files <- gsub("_LAPOP_AmericasBarometer", "", files)

# Is the data available for every country?
required_vars <- c("pais", "vb10") # country, party ID; no weights
missing_vars <- lapply(dataFiles, function(d) setdiff(required_vars, names(d)))
for (i in seq_along(dataFiles)) {
  print(i)
  print(files[i])
  print(missing_vars[[i]])
}
countries <- lengths(missing_vars) == 0

# Drop files that lack any required variable
if (!any(countries)) {
  stop("No 2010 file contains all of: ",
    paste(required_vars, collapse = ", "),
    call. = FALSE
  )
}
dataFiles <- dataFiles[countries]

# Extract relevant variables and stack them straight from the list. This avoids
# writing one object per country into the global environment and then
# collecting *every* data frame found there, which would also pick up unrelated
# data frames and silently overwrite files that clean to the same name.
# Rows now follow the (alphabetical) file order returned by list.files().
df <- bind_rows(lapply(dataFiles, function(d) d[, required_vars]))

# Labels to values.
df$pais <- as.character(as_factor(df$pais))
df$vb10 <- as.character(as_factor(df$vb10))

# Year
df$year <- 2010

# Save (create the output folder if it does not exist yet)
dir.create("04_outputs/lapop", recursive = TRUE, showWarnings = FALSE)
saveRDS(df, "04_outputs/lapop/lapop_2010.rds")

# Remove only the objects created in this section; unrelated objects in the
# workspace are left alone.
rm(
  data_dir, paths, dataFiles, files, required_vars, missing_vars,
  countries, i, df
)

# SECTION: Process 2008 LAPOP Data ---------------------------------------------
# Reads every 2008 country file, keeps the files that contain all required
# variables, stacks them into one data frame, converts value labels to text,
# tags the survey year, and saves the result as an .rds file. No weight
# variable is extracted for this year.

# Import files. The pattern is escaped and anchored so that only file names
# ending in ".dta" match (an unescaped "." matches any character).
data_dir <- "02_data/lapop/2008"
paths <- list.files(path = data_dir, pattern = "\\.dta$", full.names = TRUE)
if (length(paths) == 0) {
  # paste0(dir, character(0)) would otherwise hand the bare directory to
  # read_dta() and fail with an unhelpful message.
  stop("No .dta files found in ", data_dir, call. = FALSE)
}
dataFiles <- lapply(paths, read_dta)

# Country names for datasets (used only in the diagnostics printed below)
files <- basename(paths)
files <- sub("(^\\w+)\\s.+", "\\1", files)
files <- gsub("[[:digit:]]+", "", files)
files <- gsub("\\_lapop.*", "", files)

# Issue with bolivia: the second column of the 22nd file is renamed to "pais".
# NOTE(review): this fix is position-based. It assumes the Bolivia file is the
# 22nd one returned by list.files() and that its country variable is column 2
# -- confirm whenever the contents of the 2008 folder change.
if (length(dataFiles) < 22) {
  stop("Expected at least 22 files in ", data_dir, " but found ",
    length(dataFiles),
    call. = FALSE
  )
}
names(dataFiles[[22]])[2] <- "pais"

# Is the data available for every country?
required_vars <- c("pais", "vb10") # country, party ID
missing_vars <- lapply(dataFiles, function(d) setdiff(required_vars, names(d)))
for (i in seq_along(dataFiles)) {
  print(i)
  print(files[i])
  print(missing_vars[[i]])
}
countries <- lengths(missing_vars) == 0

# Drop files that lack any required variable.
# NOTE(review): the original comment listed StLucia, StKittsNevis, Antigua, US,
# StVincent, Grenada, Dominica and Canada as the dropped files; the same text
# appears in several year sections -- confirm against the diagnostics above.
if (!any(countries)) {
  stop("No 2008 file contains all of: ",
    paste(required_vars, collapse = ", "),
    call. = FALSE
  )
}
dataFiles <- dataFiles[countries]

# Extract relevant variables and stack them straight from the list. This avoids
# writing one object per country into the global environment and then
# collecting *every* data frame found there, which would also pick up unrelated
# data frames and silently overwrite files that clean to the same name.
# Rows now follow the (alphabetical) file order returned by list.files().
df <- bind_rows(lapply(dataFiles, function(d) d[, required_vars]))

# Labels to values.
df$pais <- as.character(as_factor(df$pais))
df$vb10 <- as.character(as_factor(df$vb10))

# Year
df$year <- 2008

# Save (create the output folder if it does not exist yet)
dir.create("04_outputs/lapop", recursive = TRUE, showWarnings = FALSE)
saveRDS(df, "04_outputs/lapop/lapop_2008.rds")

# Remove only the objects created in this section; unrelated objects in the
# workspace are left alone.
rm(
  data_dir, paths, dataFiles, files, required_vars, missing_vars,
  countries, i, df
)

# SECTION: Append All Years and Harmonize --------------------------------------
# Stacks the per-year files, standardizes country names, keeps only the
# countries treated as Latin America, derives a 0/1 partisanship indicator from
# vb10, and saves the harmonized dataset.

# Read the yearly files in the same order as before (newest first)
survey_years <- c(2018, 2016, 2014, 2012, 2010, 2008)
df <- bind_rows(lapply(
  paste0("04_outputs/lapop/lapop_", survey_years, ".rds"),
  readRDS
))

# Country: map spelling, case and language variants to one English name.
# Values that are not listed are left unchanged.
df$pais <- recode(df$pais,
  "Belice" = "Belize",
  "belize" = "Belize",
  "Brasil" = "Brazil",
  "Canadá" = "Canada",
  "República Dominicana" = "Dominican Republic",
  "REPUBLICA DOMINICANA" = "Dominican Republic",
  "Estados Unidos" = "United States",
  "Haití" = "Haiti",
  "México" = "Mexico",
  "Panamá" = "Panama",
  "Perú" = "Peru",
  "Surinam" = "Suriname",
  "Trinidad y Tobago" = "Trinidad & Tobago",
  "argentina" = "Argentina",
  "belice" = "Belize",
  "bolivia" = "Bolivia",
  "brasil" = "Brazil",
  "brazil" = "Brazil",
  "canadá" = "Canada",
  "chile" = "Chile",
  "colombia" = "Colombia",
  "costa rica" = "Costa Rica",
  "dominican republic" = "Dominican Republic",
  "ecuador" = "Ecuador",
  "el salvador" = "El Salvador",
  "estados unidos" = "United States",
  "guatemala" = "Guatemala",
  "guyana" = "Guyana",
  "haiti" = "Haiti",
  "haití" = "Haiti",
  "honduras" = "Honduras",
  "jamaica" = "Jamaica",
  "mexico" = "Mexico",
  "méxico" = "Mexico",
  "nicaragua" = "Nicaragua",
  "panama" = "Panama",
  "panamá" = "Panama",
  "paraguay" = "Paraguay",
  "peru" = "Peru",
  "perú" = "Peru",
  "república dominicana" = "Dominican Republic",
  "republica dominicana" = "Dominican Republic",
  "surinam" = "Suriname",
  "trinidad & tobago" = "Trinidad & Tobago",
  "uraguay" = "Uruguay",
  "uruguay" = "Uruguay",
  "venezuala" = "Venezuela",
  "venezuela" = "Venezuela"
)

# Latin America: 1 for every known country that is not in the exclusion list.
# %in% never returns NA, and a missing country is coded 0 explicitly. With a
# chain of `!=` comparisons a missing `pais` gave la = NA, and the subset
# below then inserted rows consisting entirely of NA.
non_latin_america <- c(
  "Bahamas", "Barbados", "Belize", "Canada", "Guyana", "Jamaica",
  "Suriname", "Trinidad & Tobago", "United States"
)
df$la <- ifelse(!is.na(df$pais) & !(df$pais %in% non_latin_america), 1, 0)

# Only Latin America
df <- df[df$la == 1, ]

# Partisanship: 1 when vb10 is one of the affirmative answers, 0 otherwise.
# A missing vb10 is coded 0 as well, as it was before.
partisan_answers <- c("Yes", "Sí", "Si", "sí")
df$partisanship <- ifelse(df$vb10 %in% partisan_answers, 1, 0)

# Save
saveRDS(df, "04_outputs/lapop_partisanship.rds")

# Leave only the harmonized `df` behind
rm(survey_years, non_latin_america, partisan_answers)